{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0a2aee25",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-10-31T07:12:50.591931700Z",
     "start_time": "2023-10-31T07:12:49.985177500Z"
    }
   },
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'get_full_repo_name' from 'huggingface_hub' (E:\\conda\\envs\\Huggingface_Toturials_1\\lib\\site-packages\\huggingface_hub\\__init__.py)",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mImportError\u001B[0m                               Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[3], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtransformers\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m BertTokenizer\n\u001B[0;32m      3\u001B[0m \u001B[38;5;66;03m#加载预训练字典和分词方法\u001B[39;00m\n\u001B[0;32m      4\u001B[0m tokenizer \u001B[38;5;241m=\u001B[39m BertTokenizer\u001B[38;5;241m.\u001B[39mfrom_pretrained(\n\u001B[0;32m      5\u001B[0m     pretrained_model_name_or_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mbert-base-chinese\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[0;32m      6\u001B[0m     cache_dir\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[0;32m      7\u001B[0m     force_download\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m,\n\u001B[0;32m      8\u001B[0m )\n",
      "File \u001B[1;32mE:\\conda\\envs\\Huggingface_Toturials_1\\lib\\site-packages\\transformers\\__init__.py:26\u001B[0m\n\u001B[0;32m     23\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtyping\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m TYPE_CHECKING\n\u001B[0;32m     25\u001B[0m \u001B[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001B[39;00m\n\u001B[1;32m---> 26\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m dependency_versions_check\n\u001B[0;32m     27\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m (\n\u001B[0;32m     28\u001B[0m     OptionalDependencyNotAvailable,\n\u001B[0;32m     29\u001B[0m     _LazyModule,\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     46\u001B[0m     logging,\n\u001B[0;32m     47\u001B[0m )\n\u001B[0;32m     50\u001B[0m logger \u001B[38;5;241m=\u001B[39m logging\u001B[38;5;241m.\u001B[39mget_logger(\u001B[38;5;18m__name__\u001B[39m)  \u001B[38;5;66;03m# pylint: disable=invalid-name\u001B[39;00m\n",
      "File \u001B[1;32mE:\\conda\\envs\\Huggingface_Toturials_1\\lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001B[0m\n\u001B[0;32m      1\u001B[0m \u001B[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001B[39;00m\n\u001B[0;32m      2\u001B[0m \u001B[38;5;66;03m#\u001B[39;00m\n\u001B[0;32m      3\u001B[0m \u001B[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001B[39;00m\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     12\u001B[0m \u001B[38;5;66;03m# See the License for the specific language governing permissions and\u001B[39;00m\n\u001B[0;32m     13\u001B[0m \u001B[38;5;66;03m# limitations under the License.\u001B[39;00m\n\u001B[0;32m     15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdependency_versions_table\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m deps\n\u001B[1;32m---> 16\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mversions\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m require_version, require_version_core\n\u001B[0;32m     19\u001B[0m \u001B[38;5;66;03m# define which module versions we always want to check at run time\u001B[39;00m\n\u001B[0;32m     20\u001B[0m \u001B[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001B[39;00m\n\u001B[0;32m     21\u001B[0m \u001B[38;5;66;03m#\u001B[39;00m\n\u001B[0;32m     22\u001B[0m \u001B[38;5;66;03m# order specific notes:\u001B[39;00m\n\u001B[0;32m     23\u001B[0m \u001B[38;5;66;03m# - tqdm must be checked before tokenizers\u001B[39;00m\n\u001B[0;32m     25\u001B[0m pkgs_to_check_at_runtime \u001B[38;5;241m=\u001B[39m [\n\u001B[0;32m     26\u001B[0m     \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpython\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m     27\u001B[0m     \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtqdm\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     37\u001B[0m     \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpyyaml\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m     38\u001B[0m ]\n",
      "File \u001B[1;32mE:\\conda\\envs\\Huggingface_Toturials_1\\lib\\site-packages\\transformers\\utils\\__init__.py:18\u001B[0m\n\u001B[0;32m      1\u001B[0m \u001B[38;5;66;03m#!/usr/bin/env python\u001B[39;00m\n\u001B[0;32m      2\u001B[0m \u001B[38;5;66;03m# coding=utf-8\u001B[39;00m\n\u001B[0;32m      3\u001B[0m \n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     15\u001B[0m \u001B[38;5;66;03m# See the License for the specific language governing permissions and\u001B[39;00m\n\u001B[0;32m     16\u001B[0m \u001B[38;5;66;03m# limitations under the License.\u001B[39;00m\n\u001B[1;32m---> 18\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mhuggingface_hub\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m get_full_repo_name  \u001B[38;5;66;03m# for backward compatibility\u001B[39;00m\n\u001B[0;32m     19\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpackaging\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m version\n\u001B[0;32m     21\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m __version__\n",
      "\u001B[1;31mImportError\u001B[0m: cannot import name 'get_full_repo_name' from 'huggingface_hub' (E:\\conda\\envs\\Huggingface_Toturials_1\\lib\\site-packages\\huggingface_hub\\__init__.py)"
     ]
    }
   ],
   "source": [
    "from transformers import BertTokenizer\n",
    "\n",
    "#加载预训练字典和分词方法\n",
    "tokenizer = BertTokenizer.from_pretrained(\n",
    "    pretrained_model_name_or_path='bert-base-chinese',\n",
    "    cache_dir=None,\n",
    "    force_download=False,\n",
    ")\n",
    "\n",
    "sents = [\n",
    "    '选择珠江花园的原因就是方便。',\n",
    "    '笔记本的键盘确实爽。',\n",
    "    '房间太小。其他的都一般。',\n",
    "    '今天才知道这书还有第6卷,真有点郁闷.',\n",
    "    '机器背面似乎被撕了张什么标签，残胶还在。',\n",
    "]\n",
    "\n",
    "tokenizer, sents"
   ]
  },
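  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3f1c9e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sanity check (sketch, not run here): bert-base-chinese\n",
    "# tokenizes Chinese text essentially character by character, so\n",
    "# tokenize() should return a list of single-character tokens.\n",
    "tokenizer.tokenize(sents[0])"
   ]
  },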
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "286d64c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#编码两个句子\n",
    "out = tokenizer.encode(\n",
    "    text=sents[0],\n",
    "    text_pair=sents[1],\n",
    "\n",
    "    #当句子长度大于max_length时,截断\n",
    "    truncation=True,\n",
    "\n",
    "    #一律补pad到max_length长度\n",
    "    padding='max_length',\n",
    "    add_special_tokens=True,\n",
    "    max_length=30,\n",
    "    return_tensors=None,\n",
    ")\n",
    "\n",
    "print(out)\n",
    "\n",
    "tokenizer.decode(out)"
   ]
  },
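  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4d2e8f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Complement to decode() (sketch, not run here): convert_ids_to_tokens()\n",
    "# maps each id back to its token string one-to-one, which makes the\n",
    "# [CLS]/[SEP]/[PAD] positions easy to inspect.\n",
    "tokenizer.convert_ids_to_tokens(out)"
   ]
  },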
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e8f221a0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]\n",
      "token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]\n",
      "special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]\n",
      "attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]\n",
      "length : 30\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#增强的编码函数\n",
    "out = tokenizer.encode_plus(\n",
    "    text=sents[0],\n",
    "    text_pair=sents[1],\n",
    "\n",
    "    #当句子长度大于max_length时,截断\n",
    "    truncation=True,\n",
    "\n",
    "    #一律补零到max_length长度\n",
    "    padding='max_length',\n",
    "    max_length=30,\n",
    "    add_special_tokens=True,\n",
    "\n",
    "    #可取值tf,pt,np,默认为返回list\n",
    "    return_tensors=None,\n",
    "\n",
    "    #返回token_type_ids\n",
    "    return_token_type_ids=True,\n",
    "\n",
    "    #返回attention_mask\n",
    "    return_attention_mask=True,\n",
    "\n",
    "    #返回special_tokens_mask 特殊符号标识\n",
    "    return_special_tokens_mask=True,\n",
    "\n",
    "    #返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用\n",
    "    #return_offsets_mapping=True,\n",
    "\n",
    "    #返回length 标识长度\n",
    "    return_length=True,\n",
    ")\n",
    "\n",
    "#input_ids 就是编码后的词\n",
    "#token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1\n",
    "#special_tokens_mask 特殊符号的位置是1,其他位置是0\n",
    "#attention_mask pad的位置是0,其他位置是1\n",
    "#length 返回句子长度\n",
    "for k, v in out.items():\n",
    "    print(k, ':', v)\n",
    "\n",
    "tokenizer.decode(out['input_ids'])"
   ]
  },
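  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5e3f9a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch, not run here: calling the tokenizer object directly is the\n",
    "# recommended equivalent of encode_plus() in recent transformers versions;\n",
    "# the same keyword arguments apply.\n",
    "out2 = tokenizer(\n",
    "    text=sents[0],\n",
    "    text_pair=sents[1],\n",
    "    truncation=True,\n",
    "    padding='max_length',\n",
    "    max_length=30,\n",
    ")\n",
    "\n",
    "out2['input_ids'] == out['input_ids']"
   ]
  },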
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1bb964a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]]\n",
      "token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
      "special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]\n",
      "length : [15, 12]\n",
      "attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 [SEP]',\n",
       " '[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#批量编码句子\n",
    "out = tokenizer.batch_encode_plus(\n",
    "    batch_text_or_text_pairs=[sents[0], sents[1]],\n",
    "    add_special_tokens=True,\n",
    "\n",
    "    #当句子长度大于max_length时,截断\n",
    "    truncation=True,\n",
    "\n",
    "    #一律补零到max_length长度\n",
    "    padding='max_length',\n",
    "    max_length=15,\n",
    "\n",
    "    #可取值tf,pt,np,默认为返回list\n",
    "    return_tensors=None,\n",
    "\n",
    "    #返回token_type_ids\n",
    "    return_token_type_ids=True,\n",
    "\n",
    "    #返回attention_mask\n",
    "    return_attention_mask=True,\n",
    "\n",
    "    #返回special_tokens_mask 特殊符号标识\n",
    "    return_special_tokens_mask=True,\n",
    "\n",
    "    #返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用\n",
    "    #return_offsets_mapping=True,\n",
    "\n",
    "    #返回length 标识长度\n",
    "    return_length=True,\n",
    ")\n",
    "\n",
    "#input_ids 就是编码后的词\n",
    "#token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1\n",
    "#special_tokens_mask 特殊符号的位置是1,其他位置是0\n",
    "#attention_mask pad的位置是0,其他位置是1\n",
    "#length 返回句子长度\n",
    "for k, v in out.items():\n",
    "    print(k, ':', v)\n",
    "\n",
    "tokenizer.decode(out['input_ids'][0]), tokenizer.decode(out['input_ids'][1])"
   ]
  },
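  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6f4a0b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch of dynamic padding, not run here: padding='longest' pads only up\n",
    "# to the longest sequence in the batch instead of a fixed max_length,\n",
    "# which saves computation on short batches.\n",
    "out_longest = tokenizer.batch_encode_plus(\n",
    "    batch_text_or_text_pairs=[sents[0], sents[1]],\n",
    "    padding='longest',\n",
    ")\n",
    "\n",
    "[len(ids) for ids in out_longest['input_ids']]"
   ]
  },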
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "751e3052",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 791, 1921, 2798, 4761, 6887, 6821, 741, 6820, 3300, 5018, 127, 1318, 117, 4696, 3300, 102]]\n",
      "token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]\n",
      "special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]\n",
      "length : [27, 30]\n",
      "attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#批量编码成对的句子\n",
    "out = tokenizer.batch_encode_plus(\n",
    "    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],\n",
    "    add_special_tokens=True,\n",
    "\n",
    "    #当句子长度大于max_length时,截断\n",
    "    truncation=True,\n",
    "\n",
    "    #一律补零到max_length长度\n",
    "    padding='max_length',\n",
    "    max_length=30,\n",
    "\n",
    "    #可取值tf,pt,np,默认为返回list\n",
    "    return_tensors=None,\n",
    "\n",
    "    #返回token_type_ids\n",
    "    return_token_type_ids=True,\n",
    "\n",
    "    #返回attention_mask\n",
    "    return_attention_mask=True,\n",
    "\n",
    "    #返回special_tokens_mask 特殊符号标识\n",
    "    return_special_tokens_mask=True,\n",
    "\n",
    "    #返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用\n",
    "    #return_offsets_mapping=True,\n",
    "\n",
    "    #返回length 标识长度\n",
    "    return_length=True,\n",
    ")\n",
    "\n",
    "#input_ids 就是编码后的词\n",
    "#token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1\n",
    "#special_tokens_mask 特殊符号的位置是1,其他位置是0\n",
    "#attention_mask pad的位置是0,其他位置是1\n",
    "#length 返回句子长度\n",
    "for k, v in out.items():\n",
    "    print(k, ':', v)\n",
    "\n",
    "tokenizer.decode(out['input_ids'][0])"
   ]
  },
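  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7a5b1c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch, assuming PyTorch is installed: return_tensors='pt' returns the\n",
    "# same fields as padded torch tensors, ready to feed into a model.\n",
    "out_pt = tokenizer.batch_encode_plus(\n",
    "    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],\n",
    "    truncation=True,\n",
    "    padding='max_length',\n",
    "    max_length=30,\n",
    "    return_tensors='pt',\n",
    ")\n",
    "\n",
    "out_pt['input_ids'].shape"
   ]
  },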
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2bbc2994",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(dict, 21128, False)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#获取字典\n",
    "zidian = tokenizer.get_vocab()\n",
    "\n",
    "type(zidian), len(zidian), '月光' in zidian,"
   ]
  },
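  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8b6c2d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The vocabulary is a plain token-to-id dict, and special tokens are\n",
    "# ordinary entries in it, so they can be looked up directly (sketch).\n",
    "zidian['[CLS]'], zidian['[SEP]'], zidian['[PAD]'], tokenizer.convert_tokens_to_ids('[UNK]')"
   ]
  },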
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9ddd67b7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(dict, 21131, 21128, 21130)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#添加新词\n",
    "tokenizer.add_tokens(new_tokens=['月光', '希望'])\n",
    "\n",
    "#添加新符号\n",
    "tokenizer.add_special_tokens({'eos_token': '[EOS]'})\n",
    "\n",
    "zidian = tokenizer.get_vocab()\n",
    "\n",
    "type(zidian), len(zidian), zidian['月光'], zidian['[EOS]']"
   ]
  },
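  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9c7d3e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Caveat: after adding tokens, any model that uses this tokenizer must have\n",
    "# its embedding matrix resized to the new vocabulary size, e.g. (sketch,\n",
    "# assuming a loaded `model`):\n",
    "#model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "len(tokenizer)"
   ]
  },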
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ed7cefad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[101, 21128, 4638, 3173, 21129, 21130, 102, 0]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'[CLS] 月光 的 新 希望 [EOS] [SEP] [PAD]'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#编码新添加的词\n",
    "out = tokenizer.encode(\n",
    "    text='月光的新希望[EOS]',\n",
    "    text_pair=None,\n",
    "\n",
    "    #当句子长度大于max_length时,截断\n",
    "    truncation=True,\n",
    "\n",
    "    #一律补pad到max_length长度\n",
    "    padding='max_length',\n",
    "    add_special_tokens=True,\n",
    "    max_length=8,\n",
    "    return_tensors=None,\n",
    ")\n",
    "\n",
    "print(out)\n",
    "\n",
    "tokenizer.decode(out)"
   ]
  }
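  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0d8e4f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# decode() can also drop the special tokens for cleaner display (sketch):\n",
    "# skip_special_tokens=True should strip [CLS], [SEP], [PAD] and the newly\n",
    "# registered [EOS] alike, since [EOS] was added as a special token above.\n",
    "tokenizer.decode(out, skip_special_tokens=True)"
   ]
  }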
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
